Data loading and library

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
westwood <- read.csv("westwood.csv")
head(westwood) # make sure data is loaded
##                                                 name host_since
## 1 Private Master+Bath, Gym+Amenities; UCLA/Rotations 2013-03-20
## 2                     Stylish 1BD Westwood Apartment 2011-11-09
## 3   Private Master bedroom by UCLA and century city- 2013-08-06
## 4    Small 1 Bedroom Loft Near UCLA on Westwood Blvd 2012-08-27
## 5                       Vintage cozy room in West LA 2014-01-07
## 6   Spacious and Private 2Bed/2 Bathroom in Westwood 2014-01-08
##   host_response_time host_response_rate host_acceptance_rate host_is_superhost
## 1       within a day                67%                  50%                 f
## 2                N/A                N/A                  86%                 f
## 3 within a few hours               100%                 100%                 f
## 4     within an hour                99%                  77%                 f
## 5     within an hour               100%                 100%                 f
## 6 within a few hours               100%                  57%                 f
##   host_neighbourhood host_total_listings_count
## 1           Westwood                         1
## 2           Westwood                         1
## 3                                            1
## 4           Westside                        40
## 5           Westwood                         1
## 6           Westwood                         1
##                                                                                                          host_verifications
## 1                                                                        ['email', 'phone', 'reviews', 'kba', 'work_email']
## 2 ['email', 'phone', 'facebook', 'reviews', 'jumio', 'offline_government_id', 'selfie', 'government_id', 'identity_manual']
## 3                                                              ['email', 'phone', 'google', 'reviews', 'kba', 'work_email']
## 4                       ['email', 'phone', 'reviews', 'offline_government_id', 'sent_id', 'kba', 'selfie', 'government_id']
## 5                                                       ['email', 'phone', 'facebook', 'reviews', 'jumio', 'government_id']
## 6                                                                                      ['email', 'phone', 'reviews', 'kba']
##   host_has_profile_pic host_identity_verified property_type       room_type
## 1                    t                      t   Condominium    Private room
## 2                    t                      t     Apartment Entire home/apt
## 3                    t                      t   Condominium    Private room
## 4                    t                      t     Apartment Entire home/apt
## 5                    t                      t     Apartment    Private room
## 6                    t                      t     Apartment Entire home/apt
##   accommodates bathrooms bedrooms beds bed_type number_of_reviews
## 1            1         1        1    1 Real Bed                49
## 2            2         1        1    1 Real Bed                29
## 3            2         1        1    1 Real Bed                11
## 4            2         1        1    1 Real Bed                32
## 5            1         1        1    1 Real Bed               295
## 6            4         2        2    2 Real Bed                74
##   review_scores_rating instant_bookable         cancellation_policy
## 1                   99                f strict_14_with_grace_period
## 2                   97                f strict_14_with_grace_period
## 3                   93                t                    moderate
## 4                   79                f strict_14_with_grace_period
## 5                   93                t strict_14_with_grace_period
## 6                   98                f strict_14_with_grace_period
##   reviews_per_month   price
## 1              0.59  $70.00
## 2              0.37 $115.00
## 3              0.71  $75.00
## 4              0.52  $72.00
## 5              3.90  $75.00
## 6              0.97 $125.00

Cleaning

num_NA <- integer(0) # the number of empty values in each column
for (c in 1:ncol(westwood)){
  westwood[which(westwood[,c]=="N/A"|westwood[,c]==""),c] <- NA
  if (!is.null(levels(westwood[,c]))){
    levels(westwood[,c])[which(levels(westwood[,c])=="N/A"|levels(westwood[,c])=="")] <- NA
  }
  num_NA[c] <- sum(is.na(westwood[,c]))
}


westwood[,4] <- as.numeric(str_extract(westwood[,4],"\\d+"))/100 # percent

westwood[,5] <- as.numeric(str_extract(westwood[,5],"\\d+"))/100 # percent

westwood[,6] <- ifelse(westwood[,6]=="f",0,1) # logical vector: 0 for FALSE, 1 for TRUE

# count for number of verifications
temp <- integer(530)
for (i in 1:length(westwood[,9])){
  temp[i] <- length(str_extract_all(westwood[,9][i],",")[[1]])+1
}
westwood[,9] <- temp

westwood[,10] <- ifelse(westwood[,10]=="f",0,1) # logical vector: 0 for FALSE, 1 for TRUE

westwood[,11] <- ifelse(westwood[,11]=="f",0,1) # logical vector: 0 for FALSE, 1 for TRUE

westwood[,21] <- ifelse(westwood[,21]=="f",0,1) # logical vector: 0 for FALSE, 1 for TRUE

temp <- substr(westwood[,24],2,nchar(as.character(westwood[,24])))
westwood[,24] <- as.numeric(str_replace_all(temp, ",", "")) # clean price

# For host_since, convert into days till the last time this dataset being measured (08 May, 2020)
westwood[,2] <- as.numeric(difftime(as.Date("2020-05-08"),as.Date(westwood[,2])))
colnames(westwood)[2] <- "host_has_been"

westwood <- westwood[,-7] # remove host_neighbourhood

westwood <- westwood[,-c(1,3,4,5,11)]

westwood$room_type<-factor(westwood$room_type)
westwood$bed_type<-factor(westwood$bed_type)
westwood$cancellation_policy<-factor(westwood$cancellation_policy)
westwood$host_is_superhost<-factor(westwood$host_is_superhost)
westwood$host_has_profile_pic<-factor(westwood$host_has_profile_pic)
westwood$host_identity_verified<-factor(westwood$host_identity_verified)
westwood$instant_bookable<-factor(westwood$instant_bookable)

head(westwood) # make sure data is cleaned
##   host_has_been host_is_superhost host_total_listings_count host_verifications
## 1          2606                 0                         1                  5
## 2          3103                 0                         1                  9
## 3          2467                 0                         1                  6
## 4          2811                 0                        40                  8
## 5          2313                 0                         1                  6
## 6          2312                 0                         1                  4
##   host_has_profile_pic host_identity_verified       room_type accommodates
## 1                    1                      1    Private room            1
## 2                    1                      1 Entire home/apt            2
## 3                    1                      1    Private room            2
## 4                    1                      1 Entire home/apt            2
## 5                    1                      1    Private room            1
## 6                    1                      1 Entire home/apt            4
##   bathrooms bedrooms beds bed_type number_of_reviews review_scores_rating
## 1         1        1    1 Real Bed                49                   99
## 2         1        1    1 Real Bed                29                   97
## 3         1        1    1 Real Bed                11                   93
## 4         1        1    1 Real Bed                32                   79
## 5         1        1    1 Real Bed               295                   93
## 6         2        2    2 Real Bed                74                   98
##   instant_bookable         cancellation_policy reviews_per_month price
## 1                0 strict_14_with_grace_period              0.59    70
## 2                0 strict_14_with_grace_period              0.37   115
## 3                1                    moderate              0.71    75
## 4                0 strict_14_with_grace_period              0.52    72
## 5                1 strict_14_with_grace_period              3.90    75
## 6                0 strict_14_with_grace_period              0.97   125

Heads up: 1. these variables have more than 20% NA values: “host_response_time” “host_response_rate” “review_scores_rating” “reviews_per_month” 2. these varaibles have relatively high (10%-20%) NA values: “host_acceptance_rate” 3. Variable 12) there are 13 levels (simplify? Might be too long and overwhelming in the model).

Model fitting outline

full_model<-lm(price~.,data=westwood)
plot(full_model)
## Warning: not plotting observations with leverage one:
##   16, 32

plot(price~.,data=westwood)

#mmps(full_model)
  1. model transformation (use all possible predictor variables)

Look at diagnostic plots + Marginal Model Plot for each transformation 1) Transform Y and keep the predictors the same (Inverse Reg + Box Cox) 2) Keep Y and transform the predictors (Box Cox) 3) Transform both Y and the X’s (Box Cox)

#the following code is a template, need to plug in actual values/variables 

#Step 1: Transform Y

#inverse regression 
library(alr3)
## Loading required package: car
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
invResPlot(full_model)

##         lambda     RSS
## 1 -0.002650926 4938471
## 2 -1.000000000 6611056
## 3  0.000000000 4938502
## 4  1.000000000 9296745

The inverse response plot approach suggests that the best transformation for the response variable is log(Y).

#box cox
summary(powerTransform(full_model))
## bcPower Transformation to Normality 
##    Est Power Rounded Pwr Wald Lwr Bnd Wald Upr Bnd
## Y1   -0.3365       -0.33      -0.4137      -0.2593
## 
## Likelihood ratio test that transformation parameter is equal to 0
##  (log transformation)
##                            LRT df       pval
## LR test, lambda = (0) 87.78134  1 < 2.22e-16
## 
## Likelihood ratio test that no transformation is needed
##                            LRT df       pval
## LR test, lambda = (1) 1805.706  1 < 2.22e-16

The box-cox approach suggests that the best transformation of the response variable is Y^-0.33.

#log(Y)
library(dplyr)
westwood_t1<-mutate(westwood, logprice=log(price))
westwood_t1<-westwood_t1[,-18]
plot(logprice~.,data=westwood_t1)

log_model<-lm(logprice~.,data=westwood_t1)
mmps(log_model)

## Warning in mmps(log_model): Interactions and/or factors skipped

par(mfrow=c(2,2))
plot(log_model)
## Warning: not plotting observations with leverage one:
##   16, 32

#Y^-(1/3)
westwood_t2<-mutate(westwood, tprice=price^-(1/3))
westwood_t2<-westwood_t2[,-18]
plot(tprice~.,data=westwood_t2)

par(mfrow=c(2,2))
y2_model<-lm(tprice~.,data=westwood_t2)
plot(y2_model)
## Warning: not plotting observations with leverage one:
##   16, 32

mmps(y2_model)

## Warning in mmps(y2_model): Interactions and/or factors skipped